Setup

# Set general output paths to all analysis files
analysis.path <- "/mnt/bb_dqbm_volume/analysis"

u_stability.path <- file.path(analysis.path, "Tonsil_th152/tests/test_U_stability")
u_stability_par.path <- file.path(analysis.path, "Tonsil_th152/tests/test_U_stability_par")
# Define fnc to read in all modules and save into dataframe
read_results <- function(file, type){
  # Depending which test run different helper columns are added
  if(type=="size"){
    rep_name <- str_split(file, "/")[[1]][9]
  } else if(type=="par"){
    k_name <- str_split(file, "/")[[1]][9]
    rep_name <- str_split(file, "/")[[1]][10]
  }
      
  temp.df <- read.csv(file) 
  temp.df <- temp.df %>%
    dplyr::rename(protein=X) %>%
    dplyr::rename_with(~gsub("X", "", .x, fixed=TRUE)) %>%
    melt(id="protein") %>%
    mutate(rep=rep_name)
  
  if(type=="par"){
    temp.df <- temp.df %>%
    mutate(k=k_name)
  }
  
  temp.df
}

compare_cor <- function(A, B){
    sum(abs(A - B))
}
# Define fnc for heatmap colours
col_fun <- colorRamp2(c(0, 0.14, 0.4, 0.8), c("grey", "grey","red", "purple"))

U stability analysis

Using different training data sizes

Read in results from the stability analysis of U using different training data sizes (U_stability.ipynb) into dataframe and plot.

# Read .csv file
u_stability.df <- read.csv(file.path(u_stability.path, "results.csv"))

# Read in segmentation SCE results
u_stability_modules.path <- list.files(file.path(u_stability.path, "162624"), 'gene_modules.csv', 
                                      full.names=TRUE, recursive=TRUE)
u_stability.modules <- lapply(u_stability_modules.path, read_results, type="size") %>% bind_rows()
# Reshape correlation distances for plotting
u_stability.long <- u_stability.df %>%
  dplyr::select(-X) %>%
  dplyr::rename_with(~ gsub('X', '', .x)) %>%
  melt() %>%
  dplyr::rename(train_size=variable, distance=value) %>%
  mutate(train_size=strtoi(train_size))
## No id variables; using all as measure variables
# Plot stability of U
ggplot(u_stability.long, aes(train_size, distance)) +
  geom_point() +
  geom_smooth(method="lm", colour=pal_npg("nrc")(1)) +
  theme_cowplot() +
  labs(x="Training size", y="Correlation distance")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).

# Create empty HeatmapList and add all folds as individual heatmaps
ht_list = NULL  
for(r in unique(u_stability.modules$rep)) {
  res.temp <- u_stability.modules %>%
    dplyr::filter(rep==r) %>%
    dplyr::select(-c("rep")) %>%
    pivot_wider(names_from = variable, values_from = value) %>%
    column_to_rownames(var="protein") %>%
    as.matrix()
  
  ht_list = ht_list + Heatmap(res.temp, column_title = paste0("Rep ", r, 
                                                              "\n(dictionary size:\n", 
                                                              ncol(res.temp), ")"),
                              col=col_fun, show_heatmap_legend=FALSE, 
                              show_row_dend=FALSE, row_names_gp=gpar(fontsize = 8),
                              column_names_gp=gpar(fontsize=8),
                              column_title_gp=gpar(fontsize=10))
}

# Draw all heatmaps together
draw(ht_list, heatmap_legend_list=list(Legend(col_fun=col_fun, title="Value")))

Using different parameters

Read in results from the stability analysis of U using different parameters (U_stability.ipynb) into dataframe and plot.

k (sparsity)

# Define path for results of different k's (sparsity)
u_stability_k.path <- file.path(u_stability_par.path, "k")

# Read in segmentation SCE results
u_stability_modules_k.path <- list.files(u_stability_k.path, 'gene_modules.csv',
                                         full.names=TRUE, recursive=TRUE)
u_stability_k.modules <- lapply(u_stability_modules_k.path, read_results, type="par") %>% bind_rows()

U matrices using different k’s

# Empty list to save correlation matrices
u_stability_k.cor <- list()

for (k_iter in unique(u_stability_k.modules$k)) {
  cat('##### k = ', k_iter, '\n')
  temp_list <-  list()
  
  # Create empty HeatmapList and add all repetitions as individual heatmaps
  ht_list = NULL
  for(r in unique(u_stability_k.modules$rep)) {
    res.temp <- u_stability_k.modules %>%
      dplyr::filter(rep==r & k==k_iter) %>%
      dplyr::select(-c("rep", "k")) %>%
      pivot_wider(names_from = variable, values_from = value) %>%
      column_to_rownames(var="protein") %>%
      as.matrix()
    
    ht_list = ht_list + Heatmap(res.temp, column_title = paste0("Rep ", r, 
                                                                "\n(dictionary size: ", 
                                                                ncol(res.temp), ")"),
                                col=col_fun, show_heatmap_legend=FALSE, 
                                show_row_dend=FALSE, row_names_gp=gpar(fontsize = 8),
                                column_names_gp=gpar(fontsize=8),
                                column_title_gp=gpar(fontsize=10))
    
    
    temp_list[[length(temp_list)+1]] <- cor(t(res.temp))
  }
  
  # Draw all heatmaps together
  draw(ht_list, heatmap_legend_list=list(Legend(col_fun=col_fun, title="Value")),
       column_title = paste0("k = ", k_iter))
  
  u_stability_k.cor[[length(u_stability_k.cor)+1]] <- temp_list
  
  cat('\n\n')
}
k = 1

k = 10

k = 2

k = 3

k = 4

k = 5

k = 6

k = 7

k = 8

k = 9

names(u_stability_k.cor) <- unique(u_stability_k.modules$k)
# Compute all pairwise distances between correlation matrices of repetions for
# each k
u_stability_k.dist <- data.frame((sapply((lapply(u_stability_k.cor, function(l){
  temp_comp <- sapply(l, function(x) sapply(l, function(y) compare_cor(x, y)))
  temp_comp[upper.tri(temp_comp)]
})), c))) %>%
  dplyr::rename_with(~ gsub("X", "", .x, fixed = TRUE)) %>%
  melt() %>%
  dplyr::rename(k=variable, distance=value)
## No id variables; using all as measure variables
u_stability_k.dist <- u_stability_k.dist %>%
  mutate(k=factor(u_stability_k.dist$k, levels=sort(as.integer(unique(u_stability_k.dist$k)))))

ggplot(u_stability_k.dist, aes(x=k, y=distance, fill=k)) +
  geom_boxplot() +
  scale_fill_npg() +
  theme_cowplot()

Mantel test of correllation between proteins for different k’s

for (n in names(u_stability_k.cor)) {
  cat('##### k = ', n, '\n')
  mantel_plot <- as.lpcor(u_stability_k.cor[[n]][[1]],
                          u_stability_k.cor[[n]][[2]],
                          u_stability_k.cor[[n]][[3]],
                          u_stability_k.cor[[n]][[4]],
                          u_stability_k.cor[[n]][[5]],
                          u_stability_k.cor[[n]][[6]],
                          u_stability_k.cor[[n]][[7]],
                          u_stability_k.cor[[n]][[8]],
                          u_stability_k.cor[[n]][[9]],
                          u_stability_k.cor[[n]][[10]]) %>%
    pairs_mantel(diag = TRUE,
                 pan.spacing = 0,
                 shape.point = 21,
                 col.point = "black",
                 fill.point = "red",
                 size.point = 1.5,
                 alpha.point = 0.6,
                 # main = "My own plot",
                 alpha = 0.2)
  print(mantel_plot)
  
  cat('\n\n')
}
k = 1

k = 10

k = 2

k = 3

k = 4

k = 5

k = 6

k = 7

k = 8

k = 9